A study of Asian Religious and Biblical Texts

This dataset contains the following books:
* Upanishads - are ancient Sanskrit texts of spiritual teaching and ideas of Hinduism. They are the part of the oldest scriptures of Hinduism, the Vedas, that deal with meditation, philosophy, and spiritual knowledge; other parts of the Vedas deal with mantras, benedictions, rituals, ceremonies, and sacrifices.

Old Testament :

More on wikipedia

**Sutra** in Indian literary traditions refers to an aphorism or a collection of aphorisms in the form of a manual or, more broadly, a condensed manual or text. Sutras are a genre of ancient and medieval Indian texts found in Hinduism, Buddhism and Jainism.

EDA

Looking at data

Quick look at the data size:

# Dataset dimensions: each column is a word (bag-of-words), each row a chapter.
cat('Number of features:', ncol(data))
## Number of features: 8267
cat('Number of records:', nrow(data))
## Number of records: 590
# Column 1 is the chapter label, so it is excluded from the word total.
cat('Number of words in total:', sum(data[-1]))
## Number of words in total: 60609

To give some perspective, the Polish classic ‘Pan Tadeusz’ has 68,682 words in total.

# Preview: first 10 chapters x first 10 word columns of the raw bag-of-words.
preview <- data[1:10, 1:10]
knitr::kable(preview, caption = 'Dataset', booktabs = TRUE) %>%
  kable_styling()
Dataset
X foolishness hath wholesome takest feelings anger vaivaswata matrix kindled
Buddhism_Ch1 0 0 0 0 0 0 0 0 0
Buddhism_Ch2 0 0 0 0 0 0 0 0 0
Buddhism_Ch3 0 0 0 0 0 0 0 0 0
Buddhism_Ch4 0 0 0 0 0 0 0 0 0
Buddhism_Ch5 0 0 0 0 0 0 0 0 0
Buddhism_Ch6 0 0 0 0 0 0 0 0 0
Buddhism_Ch7 0 0 0 0 0 0 0 0 0
Buddhism_Ch8 0 0 0 0 0 0 0 0 0
Buddhism_Ch9 0 0 0 0 0 0 0 0 0
Buddhism_Ch10 0 0 0 0 0 0 0 0 0

Looking at the data, we are certain that there is no sense in keeping the chapters separated. We then used the stringi package to extract the names of the books. We decided to combine the biblical texts into one, as they have significantly fewer chapters than the rest. Then we collapsed the data to one book per row (word occurrences were summed). We ended up with this dataframe:

# Reduce chapter labels ("Buddhism_Ch1", ...) to bare book names; the four
# biblical books (all starting with "Bo") are merged under a single "Bible".
book_name <- stri_extract(data$X, regex = "^[a-zA-Z]+")
book_name <- ifelse(startsWith(book_name, "Bo"), "Bible", book_name)
data$book_name <- book_name
data <- data[, -1]
book_names <- unique(data$book_name)

# One row per book: sum the word counts of all of its chapters.
word_cols <- seq_len(ncol(data) - 1)
df <- t(vapply(
  book_names,
  function(bn) colSums(data[data$book_name == bn, word_cols]),
  numeric(length(word_cols))
))
df <- as.data.frame(df)
rownames(df) <- NULL  # keep default 1..n row names, as before
df <- cbind(book_names, df)
colnames(df) <- c("book_name", colnames(data)[word_cols])
m <- ncol(df)

# Preview the aggregated data: one row per book, first 10 columns.
book_preview <- df[1:5, 1:10]
knitr::kable(book_preview, caption = 'Dataset', booktabs = TRUE) %>%
  kable_styling()
Dataset
book_name foolishness hath wholesome takest feelings anger vaivaswata matrix kindled
Buddhism 0 0 0 0 19 0 0 0 0
TaoTeChing 0 0 0 0 0 1 0 0 0
Upanishad 0 0 0 0 0 3 1 0 1
YogaSutra 0 2 1 0 0 0 0 1 0
Bible 2 332 3 1 0 31 0 0 3

It is already better for visualization.

Visualization

Most common words per book

# For each book, draw a horizontal bar chart of its ten most frequent words.
for (bn in book_names){
# NOTE(review): sort() on a one-row data.frame relies on implicit coercion;
# it appears to order the word columns by frequency here — confirm before
# refactoring this line.
tmp <- sort(df[df$book_name == bn, 2:m], decreasing = T)
# tmp[10:1] reverses the top ten so the most frequent word is drawn on top.
barplot(height = unlist(tmp[10:1]),
        las =2 ,
        horiz = TRUE,
        main = paste("Most frequent words in", bn),
        cex.names=0.7,
        col = "lightblue")
}

A more interesting way to visualize words is with word clouds.

TaoTeChing

# Word-frequency table for the TaoTeChing, then its word cloud.
bn <- "TaoTeChing"
freqs <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
df2 <- data.frame(word = names(df)[-1], freq = freqs)

set.seed(1234)  # reproducible cloud layout
# NOTE(review): wordcloud() takes no `main` argument — the title is likely
# ignored; confirm against the wordcloud package docs.
wordcloud(words = df2$word, freq = df2$freq, min.freq = 1,
          max.words=100, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"), main = bn)

Bible

# Word-frequency table for the Bible, then its word cloud.
bn <- "Bible"
freqs <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
df2 <- data.frame(word = names(df)[-1], freq = freqs)

set.seed(1234)  # reproducible cloud layout
wordcloud(words = df2$word, freq = df2$freq, min.freq = 1,
          max.words=100, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"), main = bn)

Buddhism

# Word-frequency table for Buddhism, then its word cloud (capped at 50 words).
bn <- "Buddhism"
freqs <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
df2 <- data.frame(word = names(df)[-1], freq = freqs)

set.seed(1234)  # reproducible cloud layout
wordcloud(words = df2$word, freq = df2$freq, min.freq = 1,
          max.words=50, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"), main = bn)

Upanishad

# Word-frequency table for the Upanishads, then its word cloud (80 words max).
bn <- "Upanishad"
freqs <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
df2 <- data.frame(word = names(df)[-1], freq = freqs)

set.seed(1234)  # reproducible cloud layout
wordcloud(words = df2$word, freq = df2$freq, min.freq = 1,
          max.words=80, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"), main = bn)

YogaSutra

# Word-frequency table for the Yoga Sutras, then its word cloud.
bn <- "YogaSutra"
freqs <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
df2 <- data.frame(word = names(df)[-1], freq = freqs)

set.seed(1234)  # reproducible cloud layout
wordcloud(words = df2$word, freq = df2$freq, min.freq = 1,
          max.words=100, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"), main = bn)

How the chapters, grouped into books, look when embedded with t-SNE:

library(Rtsne)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Fit a 2-D t-SNE embedding of the chapter-level bag-of-words.
# BUG FIX: the argument was misspelled `preplexity`, so it fell into `...`
# and was silently ignored (the default of 30 happened to match the intent).
tsne <- Rtsne(data[,1:8266], dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500)
## Performing PCA
## Read the 590 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.18 seconds (sparsity = 0.264752)!
## Learning embedding...
## Iteration 50: error is 60.478213 (50 iterations in 0.39 seconds)
## Iteration 100: error is 60.305248 (50 iterations in 0.39 seconds)
## Iteration 150: error is 59.533257 (50 iterations in 0.31 seconds)
## Iteration 200: error is 59.502624 (50 iterations in 0.26 seconds)
## Iteration 250: error is 59.494416 (50 iterations in 0.11 seconds)
## Iteration 300: error is 1.123140 (50 iterations in 0.12 seconds)
## Iteration 350: error is 1.055831 (50 iterations in 0.10 seconds)
## Iteration 400: error is 1.038577 (50 iterations in 0.10 seconds)
## Iteration 450: error is 1.030946 (50 iterations in 0.10 seconds)
## Iteration 500: error is 1.024539 (50 iterations in 0.11 seconds)
## Fitting performed in 1.99 seconds.
# Plot the 2-D embedding; each point is a chapter, coloured by its book.
data_to_plot <- as.data.frame(tsne$Y)

# `book_name` (one label per chapter row) was computed earlier from data$X.
data_to_plot$label <- book_name

ggplot(data_to_plot, aes(x = V1, y = V2, color = label)) +
  geom_point() + 
  theme_bw() + 
  scale_color_manual(values = brewer.pal(8, "Set1"))

And how they look in 3D:

# Fit a 3-D t-SNE embedding of the same data for the interactive plot below.
# BUG FIX: `preplexity` misspelled, silently ignored via `...` — now spelled
# correctly as `perplexity`.
tsne <- Rtsne(data[,1:8266], dims = 3, perplexity = 30, verbose = TRUE, max_iter = 500)
## Performing PCA
## Read the 590 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.16 seconds (sparsity = 0.264752)!
## Learning embedding...
## Iteration 50: error is 59.826617 (50 iterations in 0.88 seconds)
## Iteration 100: error is 59.752607 (50 iterations in 0.76 seconds)
## Iteration 150: error is 59.515524 (50 iterations in 0.61 seconds)
## Iteration 200: error is 59.504687 (50 iterations in 0.48 seconds)
## Iteration 250: error is 59.515569 (50 iterations in 0.40 seconds)
## Iteration 300: error is 0.977989 (50 iterations in 0.38 seconds)
## Iteration 350: error is 0.898545 (50 iterations in 0.30 seconds)
## Iteration 400: error is 0.872014 (50 iterations in 0.23 seconds)
## Iteration 450: error is 0.861557 (50 iterations in 0.36 seconds)
## Iteration 500: error is 0.854472 (50 iterations in 0.26 seconds)
## Fitting performed in 4.66 seconds.
# Interactive 3-D scatter of the embedding, coloured by book.
data_to_plot <- as.data.frame(tsne$Y)

# One book label per chapter row, computed earlier from data$X.
data_to_plot$label <- book_name

plot_ly(data_to_plot, x = ~V1, y = ~V2, z = ~V3, color = ~label, size = 0.1)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

word lengths

# Build a long data frame with one row per word OCCURRENCE: the word's
# character length (`word_len`) and the book it occurs in (`book`).
# Replaces the grow-by-rbind loop (quadratic copying) and the per-column
# sapply/rep with a single vectorized rep() per book.
d <- do.call(rbind, lapply(book_names, function(bn) {
  book_row <- df[df$book_name == bn, ]
  counts <- unlist(book_row[1, -1], use.names = FALSE)
  # Repeat each word's length as many times as the word occurs in this book.
  word_lens <- rep(nchar(colnames(book_row)[-1]), times = counts)
  data.frame(word_len = word_lens, book = rep(bn, length(word_lens)))
}))

# Density of word lengths per book (adjust = 2 smooths the kernel estimate).
ggplot(d, aes(x = word_len, fill = book)) + geom_density(adjust = 2, alpha = 0.5 ) + theme_minimal()

# Same distributions as boxplots for a side-by-side comparison.
ggplot(d, aes(y = word_len, x= book,  fill = book)) + geom_boxplot() + theme_minimal()

# Abnormally long words

w <- colnames(data)

# Words longer than 15 characters. nchar() is vectorized, so a simple logical
# subset replaces the previous parallelLapply over every column name —
# spawning workers for an O(n) string-length check was pure overhead.
# NOTE(review): the old version returned a named vector; only the values are
# used downstream (data[, long]), so the names were never needed.
long <- w[nchar(w) > 15]
long
##            noseconsciousness             distressdistress 
##          "noseconsciousness"           "distressdistress" 
##    neitherpainfulnorpleasant            contradistinction 
##  "neitherpainfulnorpleasant"          "contradistinction" 
##           clingingsustenance           clingingaggregates 
##         "clingingsustenance"         "clingingaggregates" 
##            clingingaggregate             incomprehensible 
##          "clingingaggregate"           "incomprehensible" 
##             allconsciousness             eyeconsciousness 
##           "allconsciousness"           "eyeconsciousness" 
##            selfcomprehension   consciousnessconsciousness 
##          "selfcomprehension" "consciousnessconsciousness" 
##       neitherpleasurenorpain            bodyconsciousness 
##     "neitherpleasurenorpain"          "bodyconsciousness" 
##         argumentativethought             senseimpressions 
##       "argumentativethought"           "senseimpressions" 
##           stressfulsariputta             intellectcontact 
##         "stressfulsariputta"           "intellectcontact" 
##            selfconsciousness            fabricationverbal 
##          "selfconsciousness"          "fabricationverbal" 
##             becomingbecoming             earconsciousness 
##           "becomingbecoming"           "earconsciousness" 
##             undifferentiated             threedimensional 
##           "undifferentiated"           "threedimensional" 
##     fabricationsfabrications             allcomprehending 
##   "fabricationsfabrications"           "allcomprehending" 
##            consciousnesshood       intellectconsciousness 
##          "consciousnesshood"     "intellectconsciousness" 
##             selfreproductive    neitherpleasantnorpainful 
##           "selfreproductive"  "neitherpleasantnorpainful" 
##       lamentationlamentation            soulconsciousness 
##     "lamentationlamentation"          "soulconsciousness" 
##          tongueconsciousness            propertysariputta 
##        "tongueconsciousness"          "propertysariputta" 
##   clingingclingingsustenance           fabricationsmental 
## "clingingclingingsustenance"         "fabricationsmental"
# Total occurrences of each abnormally long word across all chapters.
d <- as.data.frame(colSums(data[, long]))
d$names <- rownames(d)
colnames(d)[1] <- "occurrances"  # sic: misspelling kept to preserve the printed header

# Show the long words ordered from most to least frequent.
d %>% arrange(desc(occurrances))
##    occurrances                      names
## 1           16          clingingaggregate
## 2           13         clingingaggregates
## 3            8         clingingsustenance
## 4            6  neitherpainfulnorpleasant
## 5            5          selfconsciousness
## 6            3  neitherpleasantnorpainful
## 7            2          noseconsciousness
## 8            2           incomprehensible
## 9            2           allconsciousness
## 10           2           eyeconsciousness
## 11           2     neitherpleasurenorpain
## 12           2          bodyconsciousness
## 13           2           intellectcontact
## 14           2           earconsciousness
## 15           2           undifferentiated
## 16           2     intellectconsciousness
## 17           2        tongueconsciousness
## 18           1           distressdistress
## 19           1          contradistinction
## 20           1          selfcomprehension
## 21           1 consciousnessconsciousness
## 22           1       argumentativethought
## 23           1           senseimpressions
## 24           1         stressfulsariputta
## 25           1          fabricationverbal
## 26           1           becomingbecoming
## 27           1           threedimensional
## 28           1   fabricationsfabrications
## 29           1           allcomprehending
## 30           1          consciousnesshood
## 31           1           selfreproductive
## 32           1     lamentationlamentation
## 33           1          soulconsciousness
## 34           1          propertysariputta
## 35           1 clingingclingingsustenance
## 36           1         fabricationsmental

At Savatthi. There the Blessed One said, "Monks, there are these five clinging-aggregates. Which five? Form as a clinging-aggregate, feeling as a clinging-aggregate, perception as a clinging-aggregate, fabrications as a clinging-aggregate, consciousness as a clinging-aggregate.

Sentiment

The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive). The annotations were manually done by crowdsourcing.

get_words_out_of_bag <- function(df,n){
  #' Expand row `n` of a bag-of-words data frame into a vector of words,
  #' each repeated as many times as it occurs in that row.
  #' @param df bag-of-words data frame; column 1 is the book/chapter label
  #' @param n  index of the row to expand
  #' @return character vector of repeated words, or list(NULL) when the row
  #'   contains no words (only the label column is non-zero)
  row <- df[n,] %>% select(which(df[n,] != 0))
  cols <- colnames(row)
  if(length(cols) == 1){return(list(NULL))}
  # Vectorized expansion: one rep() call instead of an sapply over columns.
  # This function runs for every row, so the per-column loop was the hot
  # path of the multi-minute extraction step.
  rep(cols[-1], times = unlist(row[1, -1], use.names = FALSE))
}

get_words_in_books <- function(df, bookname){
  #' Expand every chapter of `bookname` into one flat vector of word
  #' occurrences.
  #' @param df bag-of-words data frame; column 1 holds the book name
  #' @param bookname book to extract
  slice <- df[df[,1] == bookname,]
  # BUG FIX: iterate over the rows of `slice` itself, not the full `df`.
  # Previously rows 1..nrow(slice) of the WHOLE data frame were expanded,
  # so every book except the first received words from the wrong chapters.
  all <- lapply(seq_len(nrow(slice)), get_words_out_of_bag, df = slice)
  unlist(all)
}
# Use all available cores for the word-extraction step below.
parallelStartMulticore(detectCores())
## Starting parallelization in mode=multicore with cpus=4.
# Reload the raw labelled document-term matrix and reduce the chapter labels
# (e.g. "Buddhism_Ch1") to bare book names.
df <- read.csv('./AllBooks_baseline_DTM_Labelled.csv')
books <- stri_extract(df$X, regex =  "^[a-zA-Z]+")
df$X <- books

# Expand every book back into its full list of word occurrences, in parallel,
# and time the whole step.
starttime <- Sys.time()
inside <- parallelLapply(unique(books), get_words_in_books, df=df)
## Mapping in parallel: mode = multicore; level = NA; cpus = 4; elements = 8.
endtime <- Sys.time()
endtime - starttime
## Time difference of 3.746424 mins
select_sentiment <- function(word, sent_name){
  #' Return the words from `word` that carry the given NRC sentiment,
  #' with their occurrence counts, sorted by count (descending).
  #' @param word character vector of words to analyse
  #' @param sent_name name of the sentiment to extract
  
  require('tidytext')
  require('textdata')
  # Select the lexicon entries for the requested sentiment.
  nrc_sent <- get_sentiments("nrc") %>% 
    filter(sentiment == sent_name)
  
  # Count how often those sentiment words occur in the input.
  as.data.frame(word) %>%
    inner_join(nrc_sent) %>%
    count(word, sort = TRUE) %>%
    mutate(sent = sent_name)
}

all_sentiment <- function(word, k = 100, all = FALSE, sent = c("trust", 
                                                               "fear", 
                                                               "negative", 
                                                               "sadness",
                                                               "anger", 
                                                               "surprise",
                                                               "positive", 
                                                               "disgust",
                                                               "anticipation", 
                                                               "joy")){
  #' Return the k most emotional words in the set.
  #' @param word character vector of words
  #' @param k number of rows kept via head(k)
  #' @param all if TRUE, return every word regardless of k
  #' @param sent return only these sentiments — all of them by default
  
  require('dplyr')
  # Vectorize the per-sentiment counter over the sentiment names.
  vek_s <- Vectorize(select_sentiment, vectorize.args = "sent_name", SIMPLIFY = FALSE)
  # One data frame of counts per sentiment...
  x <- vek_s(word, sent)
  # ...combined into a single frame,
  y <- do.call(rbind, x)
  # sorted by count (descending),
  y2 <- y %>% arrange(desc(n))
  # and truncated to the top k unless `all` was requested.
  if(!all) {
    y2<- head(y2, k)
  }
  return(y2)
}

sentiment_table_mixed <- function(word, name, sent = c("trust", 
                                                       "fear", 
                                                       "negative", 
                                                       "sadness",
                                                       "anger", 
                                                       "surprise",
                                                       "positive", 
                                                       "disgust",
                                                       "anticipation", 
                                                       "joy")) {
  #' Return a one-column table with the percentage share of each sentiment.
  #' @param word character vector of words
  #' @param name name of the corpus (becomes the output column name)
  #' NOTE(review): group_by(sent) %>% summarise() returns its rows in
  #' alphabetical sentiment order, NOT in the order of the `sent` argument —
  #' any downstream row labelling must account for this.
  
  require("dplyr")
  zliczone <- all_sentiment(word, all = TRUE, sent = sent)
  tab <- zliczone %>%
    group_by(sent) %>%
    summarise(n = sum(n))%>%
    mutate(procent = 100*n / sum(n))%>%
    select(procent)
  
  colnames(tab) <- c(name)
  return(tab)
}

# One word list per book, keyed by book name.
booknames <- unique(books)
names(inside) <- booknames

# Percentage sentiment profile of every book, computed in parallel.
df_sented_list <- parallelLapply(booknames, function(x, inside){sentiment_table_mixed(unlist(inside[x]), x)}, inside = inside)

# Combine into one table with one column per book.
df_sented <- do.call("cbind",df_sented_list)


# Attach the sentiment names to the rows of the profile table.
# BUG FIX: sentiment_table_mixed() builds its rows via
# group_by(sent) %>% summarise(), which orders groups ALPHABETICALLY
# (anger, anticipation, disgust, ...), so the labels must be attached in
# alphabetical order too. The previous hand-written order
# ("trust", "fear", "negative", ...) mislabelled every single row.
df_sented <- cbind(df_sented, sent = sort(c("trust",
                                            "fear",
                                            "negative",
                                            "sadness",
                                            "anger",
                                            "surprise",
                                            "positive",
                                            "disgust",
                                            "anticipation",
                                            "joy")))

# Long format for the pivot table: one row per (book, sentiment) pair.
melted <- reshape::melt(df_sented)
rpivotTable(data = melted, cols = "sent", rows = "variable", rendererName = "Heatmap", aggregatorName = "Sum", vals = "value")

Let’s see with books of Bible treated as one:

# Collapse the four biblical books into a single averaged "Bible" profile.
df_sented_simp <- df_sented

bible_cols <- c("BookOfEccleasiasticus", "BookOfProverb",
                "BookOfEcclesiastes", "BookOfWisdom")
df_sented_simp$Bible <- rowMeans(df_sented[, bible_cols])

# Drop the individual biblical books now that they are merged.
for (col in bible_cols) {
  df_sented_simp[[col]] <- NULL
}

melted <- reshape::melt(df_sented_simp)
## Using sent as id variables
rpivotTable(data = melted, cols = "sent", rows = "variable", rendererName = "Heatmap", aggregatorName = "Sum", vals = "value")